This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.
Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Cmd+Shift+Enter.
# Read the song-level data set from the working directory and show its first rows.
spotify <- read.csv(file = "Popular_Spotify_Songs.csv")
head(x = spotify)
Add a new chunk by clicking the Insert Chunk button on the toolbar or by pressing Cmd+Option+I.
When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Cmd+Shift+K to preview the HTML file).
The preview shows you a rendered HTML copy of the contents of the editor. Consequently, unlike Knit, Preview does not run any R code chunks. Instead, the output of the chunk when it was last run in the editor is displayed.
# Load the data set and preview it.
spotify <- read.csv(file = "Popular_Spotify_Songs.csv")
head(x = spotify)
# Bulk conversion that was tried and left disabled:
# spotify[cols_to_convert] <- lapply(spotify[cols_to_convert], function(x) as.numeric(as.character(x)))
# Coerce streams to numeric; entries that cannot be parsed become NA (with a warning).
spotify[["streams"]] <- as.numeric(spotify[["streams"]])
Warning: NAs introduced by coercion
# Convert the Deezer playlist counts to numeric. Thousands separators are
# stripped first because as.numeric() cannot parse strings such as "1,234"
# and would silently turn every such count into NA.
# NOTE(review): the NAs reported for this column point to comma-formatted
# values in the CSV — confirm against the raw file.
spotify$in_deezer_playlists <- as.numeric(
  gsub(",", "", spotify$in_deezer_playlists, fixed = TRUE)
)
G2;H2;Warningh: NAs introduced by coerciong
# Convert the Shazam chart counts to numeric. Thousands separators are
# stripped first because as.numeric() cannot parse strings such as "1,234"
# and would turn them into NA; genuinely empty entries still become NA.
# NOTE(review): presumably part of the NAs in this column come from
# comma-formatted values — confirm against the raw file.
spotify$in_shazam_charts <- as.numeric(
  gsub(",", "", spotify$in_shazam_charts, fixed = TRUE)
)
G2;H2;Warningh: NAs introduced by coerciong
# Inspect the types and first values of columns 3 through 14.
str(object = spotify[, seq(from = 3, to = 14)])
'data.frame': 953 obs. of 12 variables:
$ artist_count : int 2 1 1 1 1 2 2 1 1 2 ...
$ released_year : int 2023 2023 2023 2019 2023 2023 2023 2023 2023 2023 ...
$ released_month : int 7 3 6 8 5 6 3 7 5 3 ...
$ released_day : int 14 23 30 23 18 1 16 7 15 17 ...
$ in_spotify_playlists: int 553 1474 1397 7858 3133 2186 3090 714 1096 2953 ...
$ in_spotify_charts : int 147 48 113 100 50 91 50 43 83 44 ...
$ streams : num 1.41e+08 1.34e+08 1.40e+08 8.01e+08 3.03e+08 ...
$ in_apple_playlists : int 43 48 94 116 84 67 34 25 60 49 ...
$ in_apple_charts : int 263 126 207 207 133 213 222 89 210 110 ...
$ in_deezer_playlists : num 45 58 91 125 87 88 43 30 48 66 ...
$ in_deezer_charts : int 10 14 14 12 15 17 13 13 11 13 ...
$ in_shazam_charts : num 826 382 949 548 425 946 418 194 953 339 ...
# Scatterplot matrix across columns 3 through 14.
pairs(x = spotify[, seq(from = 3, to = 14)], main = "Linear Relationships Between Metrics")
# Restrict to streams and the three playlist-count columns.
selected_data <- spotify[
  ,
  c("streams", "in_spotify_playlists", "in_deezer_playlists", "in_apple_playlists")
]
# Scatterplot matrix of the reduced column set.
pairs(x = selected_data, main = "Pairs Plot of Playlist Counts and Streams")
# Number of missing values in each column.
colSums(x = is.na(spotify))
track_name artist.s._name artist_count released_year released_month released_day in_spotify_playlists
0 0 0 0 0 0 0
in_spotify_charts streams in_apple_playlists in_apple_charts in_deezer_playlists in_deezer_charts in_shazam_charts
0 1 0 0 79 0 57
bpm key mode danceability_. valence_. energy_. acousticness_.
0 0 0 0 0 0 0
instrumentalness_. liveness_. speechiness_.
0 0 0
# Row and column counts of the full data set.
dim(x = spotify)
[1] 953 24
library(ggplot2)
# Histogram of release years, one bar per year. The title and axis labels
# describe the plotted variable (released_year). The x label is a literal
# string because names() of an unnamed column vector is NULL.
ggplot(spotify, aes(x = released_year)) +
  geom_histogram(binwidth = 1, fill = "skyblue", color = "white") +
  labs(
    title = "Distribution of Released Year",
    x = "Released Year",
    y = "Number of Songs"
  )
# Kernel density estimate of the same variable, drawn with base graphics.
plot(
  density(spotify$released_year, na.rm = TRUE),
  main = "Density Plot of Released Year",
  xlab = "Released Year",
  col = "blue",
  lwd = 2
)
# Open the data frame in the interactive data viewer.
View(spotify)
# Streams against Spotify playlist count, one point per song, coloured by mode.
ggplot(
  data = spotify,
  mapping = aes(x = streams, y = in_spotify_playlists, colour = mode)
) +
  geom_point() +
  theme_minimal() +
  labs(
    title = "Streams vs Playlist Metrics by Mode",
    x = "Streams",
    y = "Number in Spotify Playlists"
  )
library(shiny)
library(ggplot2)

# UI: mode check boxes in the sidebar, scatter plot in the main panel.
# All modes found in the data start out ticked.
ui <- fluidPage(
  titlePanel("Streams vs Spotify Playlists by Mode"),
  sidebarLayout(
    sidebarPanel = sidebarPanel(
      checkboxGroupInput(
        inputId = "mode_select",
        label = "Select Mode(s):",
        choices = unique(spotify$mode),
        selected = unique(spotify$mode)
      )
    ),
    mainPanel = mainPanel(
      plotOutput(outputId = "scatterPlot")
    )
  )
)

# Server: redraw the scatter plot using only the rows whose mode is ticked.
server <- function(input, output) {
  output$scatterPlot <- renderPlot({
    keep_rows <- spotify$mode %in% input$mode_select
    filtered_data <- spotify[keep_rows, ]
    ggplot(
      data = filtered_data,
      mapping = aes(x = streams, y = in_spotify_playlists, colour = mode)
    ) +
      geom_point() +
      theme_minimal() +
      labs(
        title = "Streams vs Playlist Metrics by Mode",
        x = "Streams",
        y = "Number in Spotify Playlists"
      )
  })
}

# Launch the app
shinyApp(ui = ui, server = server)
G3;
Listening on http://127.0.0.1:4945
g
library(dplyr)
# Create a combined "Song - Artist" label column.
spotify <- spotify %>%
  mutate(song.artist = paste(track_name, "-", artist.s._name))
# The data viewer is View() with a capital V; lowercase view() is not defined
# by the packages attached here and stops with "could not find function".
View(spotify)
G1;H1;Errorh in view(spotify) : could not find function "view"
Error during wrapup: not that many frames on the stack
Error: no more error handlers available (recursive errors?); invoking 'abort' restart
g
# For every release year keep the row(s) with the highest stream count
# (ties are kept), then drop the grouping.
yearly_top_song <- spotify %>%
  dplyr::group_by(released_year) %>%
  dplyr::slice_max(streams, n = 1, with_ties = TRUE) %>%
  dplyr::ungroup()
# Rank the per-year winners by streams and keep the first ten rows.
top10_yearly <- yearly_top_song %>%
  arrange(desc(streams)) %>%
  slice_head(n = 10)
top10_yearly
# Turn song.artist into a factor whose level order follows the stream ranking.
top10_yearly <- top10_yearly %>%
  arrange(desc(streams)) %>%
  mutate(song.artist = factor(song.artist, levels = unique(song.artist)))
top10_yearly
# Bar chart of the ten most-streamed yearly winners. The x axis carries the
# release year and the fill colour identifies the song, so the axis title and
# the legend title are labelled to match those mappings.
ggplot(top10_yearly, aes(x = released_year, y = streams, fill = factor(song.artist))) +
  geom_bar(stat = "identity") +
  labs(
    title = "Top Streamed Songs per Year",
    x = "Released Year",
    y = "Number of Streams",
    fill = "Song - Artist"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Same ranking with the release year treated as a discrete axis; one bar per
# year, coloured by song.
ggplot(
  data = top10_yearly,
  mapping = aes(x = factor(released_year), y = streams, fill = song.artist)
) +
  geom_col() +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Top Streamed Song For Each Released Year: Top 10 Ranking Overall",
    x = "Year",
    y = "Streams",
    fill = "Song - Artist"
  )
# For every release year keep the five most-streamed rows (ties are kept, so a
# year can contribute more than five), then drop the grouping.
yearly_top_songs <- spotify %>%
  dplyr::group_by(released_year) %>%
  dplyr::slice_max(streams, n = 5, with_ties = TRUE) %>%
  dplyr::ungroup()
# Open the result in the interactive data viewer.
View(yearly_top_songs)
# 2023 subset of the per-year top-five table.
top_2023 <- yearly_top_songs %>%
  dplyr::filter(released_year == 2023)
# Bar chart with songs ordered from most to fewest streams.
ggplot(
  data = top_2023,
  mapping = aes(x = reorder(track_name, -streams), y = streams, fill = song.artist)
) +
  geom_col() +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Top 5 Streamed Songs in 2023",
    x = "Song",
    y = "Streams",
    fill = "Song - Artist"
  )
# Row and column counts of the 2022 subset.
dim(x = yearly_top_songs[yearly_top_songs[["released_year"]] == 2022, ])
[1] 5 25
# Per-column summary of the 2022 rows; NA counts appear where values are missing.
summary(object = yearly_top_songs[yearly_top_songs[["released_year"]] == 2022, ])
track_name artist.s._name artist_count released_year released_month released_day in_spotify_playlists in_spotify_charts
Length:5 Length:5 Min. :1.0 Min. :2022 Min. :3.0 Min. : 6.0 Min. : 8506 Min. : 42.0
Class :character Class :character 1st Qu.:1.0 1st Qu.:2022 1st Qu.:5.0 1st Qu.: 6.0 1st Qu.: 8576 1st Qu.: 42.0
Mode :character Mode :character Median :2.0 Median :2022 Median :5.0 Median : 6.0 Median : 8870 Median : 43.0
Mean :1.6 Mean :2022 Mean :5.8 Mean :14.2 Mean :11713 Mean : 60.4
3rd Qu.:2.0 3rd Qu.:2022 3rd Qu.:7.0 3rd Qu.:22.0 3rd Qu.: 9037 3rd Qu.: 45.0
Max. :2.0 Max. :2022 Max. :9.0 Max. :31.0 Max. :23575 Max. :130.0
streams in_apple_playlists in_apple_charts in_deezer_playlists in_deezer_charts in_shazam_charts bpm key
Min. :1.231e+09 Min. : 94.0 Min. : 65.0 Min. :139.0 Min. :14.0 Min. : 49.0 Min. : 92.0 Length:5
1st Qu.:1.264e+09 1st Qu.:104.0 1st Qu.:108.0 1st Qu.:141.0 1st Qu.:14.0 1st Qu.:127.8 1st Qu.:107.0 Class :character
Median :1.357e+09 Median :124.0 Median :120.0 Median :164.0 Median :26.0 Median :160.0 Median :128.0 Mode :character
Mean :1.561e+09 Mean :188.2 Mean :124.8 Mean :327.6 Mean :25.2 Mean :136.2 Mean :126.4
3rd Qu.:1.441e+09 3rd Qu.:216.0 3rd Qu.:133.0 3rd Qu.:331.0 3rd Qu.:26.0 3rd Qu.:168.5 3rd Qu.:131.0
Max. :2.513e+09 Max. :403.0 Max. :198.0 Max. :863.0 Max. :46.0 Max. :176.0 Max. :174.0
NA's :1
mode danceability_. valence_. energy_. acousticness_. instrumentalness_. liveness_. speechiness_. song.artist
Length:5 Min. :52.0 Min. :19.0 Min. :47.0 Min. : 1 Min. :0.0 Min. : 9.0 Min. : 4.0 Length:5
Class :character 1st Qu.:62.0 1st Qu.:24.0 1st Qu.:71.0 1st Qu.: 1 1st Qu.:0.0 1st Qu.:13.0 1st Qu.: 6.0 Class :character
Mode :character Median :65.0 Median :43.0 Median :72.0 Median : 9 Median :0.0 Median :23.0 Median : 8.0 Mode :character
Mean :68.2 Mean :41.4 Mean :68.2 Mean :11 Mean :0.6 Mean :20.6 Mean :10.4
3rd Qu.:71.0 3rd Qu.:55.0 3rd Qu.:73.0 3rd Qu.:10 3rd Qu.:0.0 3rd Qu.:27.0 3rd Qu.: 9.0
Max. :91.0 Max. :66.0 Max. :78.0 Max. :34 Max. :3.0 Max. :31.0 Max. :25.0
# Print the 2022 rows of the top-five table for a direct look.
print(x = yearly_top_songs[yearly_top_songs[["released_year"]] == 2022, ])
# Compact overview of name, artist and streams for every 2022 release.
spotify %>%
  dplyr::filter(released_year == 2022) %>%
  dplyr::select(track_name, artist.s._name, streams) %>%
  dplyr::glimpse()
Rows: 402
Columns: 3
$ track_name <chr> "As It Was", "Kill Bill", "Calm Down (with Selena Gomez)", "Creepin'", "Anti-Hero", "I'm Good (Blue)", "I Ain't Worried", "La …
$ artist.s._name <chr> "Harry Styles", "SZA", "R��ma, Selena G", "The Weeknd, 21 Savage, Metro Boomin", "Taylor Swift", "Bebe Rexha, David Guetta", "…
$ streams <dbl> 2513188493, 1163093654, 899183384, 843957510, 999748277, 1109433169, 1085685420, 1214083358, 720434240, 674072710, 404562836, …
# How often each track name occurs among the 2022 top songs (reveals ties or repeats).
yearly_top_songs %>%
  dplyr::filter(released_year == 2022) %>%
  dplyr::count(track_name)
# Missing-value counts for streams, track_name and song.artist in the 2022 rows.
yearly_top_songs %>%
  dplyr::filter(released_year == 2022) %>%
  dplyr::summarise(
    missing_streams = sum(is.na(streams)),
    missing_track = sum(is.na(track_name)),
    missing_artist = sum(is.na(song.artist))
  )
# Shiny app: choose a release year and see its most-streamed songs as a bar chart.
library(shiny)
library(dplyr)
library(ggplot2)

# UI: year drop-down in the sidebar, bar chart in the main panel.
ui <- fluidPage(
  titlePanel("Top Streamed Songs by Year"),
  sidebarLayout(
    sidebarPanel(
      selectInput("year", "Select Year:", choices = sort(unique(yearly_top_songs$released_year)))
    ),
    mainPanel(
      plotOutput("topSongsPlot")
    )
  )
)

# Server: filters yearly_top_songs to the chosen year and draws the chart.
server <- function(input, output, session) {
  # Replace byte sequences that are not valid UTF-8 with "*".
  clean_text <- function(x) {
    iconv(x, from = "UTF-8", to = "UTF-8", sub = "*")
  }

  output$topSongsPlot <- renderPlot({
    # Wait until a year is available before filtering.
    req(input$year)

    # Both text columns that reach the plot are sanitised: track_name feeds
    # the axis labels and song.artist feeds the legend, and song.artist is
    # built from the same raw text as track_name.
    selected_year_data <- yearly_top_songs %>%
      filter(released_year == input$year) %>%
      mutate(
        track_name = clean_text(track_name),
        song.artist = clean_text(song.artist)
      )

    ggplot(selected_year_data, aes(x = reorder(track_name, -streams), y = streams, fill = song.artist)) +
      geom_bar(stat = "identity") +
      labs(
        title = paste("Top 5 Streamed Songs in", input$year),
        x = "Song",
        y = "Number of Streams",
        fill = "Song & Artist"
      ) +
      theme_minimal() +
      theme(axis.text.x = element_text(angle = 45, hjust = 1))
  })
}

# Run app
shinyApp(ui = ui, server = server)
G3;
Listening on http://127.0.0.1:4945
gG1;H1;Error during wrapuph: not that many frames on the stack
Error: no more error handlers available (recursive errors?); invoking 'abort' restart
gG1;H1;Error during wrapuph: not that many frames on the stack
Error: no more error handlers available (recursive errors?); invoking 'abort' restart
g
# Open the 2023 subset in the data viewer. The function is View() with a
# capital V; lowercase view() is not defined by the packages attached here.
View(top_2023)
G1;H1;Errorh in view(top_2023) : could not find function "view"
Error during wrapup: not that many frames on the stack
Error: no more error handlers available (recursive errors?); invoking 'abort' restart
g
# List the column names of the 2023 subset.
colnames(top_2023)
[1] "track_name" "artist.s._name" "artist_count" "released_year" "released_month" "released_day"
[7] "in_spotify_playlists" "in_spotify_charts" "streams" "in_apple_playlists" "in_apple_charts" "in_deezer_playlists"
[13] "in_deezer_charts" "in_shazam_charts" "bpm" "key" "mode" "danceability_."
[19] "valence_." "energy_." "acousticness_." "instrumentalness_." "liveness_." "speechiness_."
[25] "song.artist"
# Load packages explicitly
library(dplyr)
library(tidyr)
# Keep the label plus five audio features, then reshape to one row per
# (song, feature) pair. select() is namespace-qualified so that a same-named
# function from another attached package cannot be picked up instead.
top_2023_features <- top_2023 %>%
  dplyr::select(
    song.artist, bpm, `danceability_.`, `speechiness_.`, `energy_.`, `acousticness_.`
  ) %>%
  tidyr::pivot_longer(
    cols = !song.artist,
    names_to = "feature",
    values_to = "value"
  )
# Circular bar plot: dodged bars per feature, wrapped onto polar coordinates,
# with radial axis text, ticks and grid lines removed.
ggplot(
  data = top_2023_features,
  mapping = aes(x = feature, y = value, fill = song.artist)
) +
  geom_col(position = "dodge") +
  coord_polar() +
  theme_minimal() +
  theme(
    axis.text.x = element_text(size = 12, face = "bold"),
    axis.text.y = element_blank(),
    axis.ticks = element_blank(),
    panel.grid = element_blank()
  ) +
  labs(
    title = "Audio Feature Metrics for Top 5 Songs in 2023",
    x = "",
    y = "",
    fill = "Song - Artist"
  )
# For every release year keep the three most-streamed rows (ties are kept),
# then drop the grouping.
yearly_top3_songs <- spotify %>%
  dplyr::group_by(released_year) %>%
  dplyr::slice_max(streams, n = 3, with_ties = TRUE) %>%
  dplyr::ungroup()
# Open the result in the interactive data viewer.
View(yearly_top3_songs)
# 2023 subset of the per-year top-three table.
top3_2023 <- yearly_top3_songs %>%
  filter(released_year == 2023)
head(top3_2023)
# Keep the label plus five audio features and reshape to one row per
# (song, feature) pair. select() is namespace-qualified to avoid masking.
top3_2023_features <- top3_2023 %>%
  dplyr::select(
    song.artist,
    bpm,
    `danceability_.`,
    `speechiness_.`,
    `energy_.`,
    `acousticness_.`
  ) %>%
  pivot_longer(cols = -song.artist, names_to = "feature", values_to = "value")
# Preview the reshaped table that was just built (top3_2023 itself was
# already previewed above).
head(top3_2023_features)
# Circular bar plot of the audio features of the 2023 top-three songs. The
# title states three songs to match top3_2023_features, and the legend title
# names what the fill actually encodes (the combined song.artist label).
ggplot(top3_2023_features, aes(x = feature, y = value, fill = song.artist)) +
  geom_bar(stat = "identity", position = "dodge") +
  coord_polar() +
  labs(
    title = "Audio Feature Metrics for Top 3 Songs in 2023",
    x = "",
    y = "",
    fill = "Song - Artist"
  ) +
  theme_minimal() +
  theme(
    axis.text.y = element_blank(),
    axis.ticks = element_blank(),
    panel.grid = element_blank(),
    axis.text.x = element_text(size = 12, face = "bold")
  )
# Load clean libraries (force reloading if needed)
library(dplyr)
library(tibble)
library(tidyr)
library(fmsb)
Warning: package ‘fmsb’ was built under R version 4.3.3
Registered S3 methods overwritten by 'fmsb':
method from
print.roc pROC
plot.roc pROC
library(scales)
Warning: package ‘scales’ was built under R version 4.3.3
# Give the feature columns plain names; backticks are needed because the
# original names end in "_.".
top_2023_clean <- dplyr::rename(
  top_2023,
  danceability = `danceability_.`,
  speechiness = `speechiness_.`,
  energy = `energy_.`,
  acousticness = `acousticness_.`
)
# Keep the label column and the five features shown on the chart.
radar_data <- top_2023_clean %>%
  dplyr::select(song.artist, bpm, danceability, speechiness, energy, acousticness)
# Rescale every numeric feature to [0, 1] across the selected songs.
radar_data_norm <- radar_data %>%
  mutate(across(
    where(is.numeric) & !song.artist,
    function(feature) scales::rescale(feature, to = c(0, 1))
  ))
# "Max" and "Min" rows for the radar structure, placed ahead of the song rows.
max_min <- bind_rows(
  data.frame(
    bpm = 1, danceability = 1, speechiness = 1, energy = 1, acousticness = 1,
    row.names = c("Max")
  ),
  data.frame(
    bpm = 0, danceability = 0, speechiness = 0, energy = 0, acousticness = 0,
    row.names = c("Min")
  )
)
# Append the song rows, using the song.artist label as the row name.
radar_matrix <- bind_rows(
  max_min,
  column_to_rownames(radar_data_norm, "song.artist")
)
# One border colour per song row, plus a translucent fill of the same hue.
colors_border <- rainbow(nrow(radar_matrix) - 2)
colors_in <- adjustcolor(colors_border, alpha.f = 0.25)
# Draw the radar chart.
fmsb::radarchart(
  radar_matrix,
  title = "Top 5 Songs in 2023 — Audio Features Radar Chart",
  axistype = 1,
  caxislabels = seq(0, 1, 0.2),
  axislabcol = "grey",
  pcol = colors_border,
  pfcol = colors_in,
  plwd = 2,
  plty = 1,
  cglcol = "grey",
  cglty = 1,
  cglwd = 0.8,
  vlcex = 0.9
)
# Legend listing the song rows (everything after the Max/Min rows).
legend(
  "topright",
  legend = rownames(radar_matrix)[-c(1, 2)],
  col = colors_border,
  pch = 20,
  bty = "n",
  text.col = "black",
  cex = 0.8
)
# Give the feature columns plain names; backticks are needed because the
# original names end in "_.".
top3_2023_clean <- dplyr::rename(
  top3_2023,
  danceability = `danceability_.`,
  speechiness = `speechiness_.`,
  energy = `energy_.`,
  acousticness = `acousticness_.`
)
# Keep the label column and the five features shown on the chart.
radar_data3 <- top3_2023_clean %>%
  dplyr::select(song.artist, bpm, danceability, speechiness, energy, acousticness)
# Rescale every numeric feature to [0, 1] across the selected songs.
radar_data_norm3 <- radar_data3 %>%
  mutate(across(
    where(is.numeric) & !song.artist,
    function(feature) scales::rescale(feature, to = c(0, 1))
  ))
# "Max" and "Min" rows for the radar structure, placed ahead of the song rows.
max_min3 <- bind_rows(
  data.frame(
    bpm = 1, danceability = 1, speechiness = 1, energy = 1, acousticness = 1,
    row.names = c("Max")
  ),
  data.frame(
    bpm = 0, danceability = 0, speechiness = 0, energy = 0, acousticness = 0,
    row.names = c("Min")
  )
)
# Append the song rows, using the song.artist label as the row name.
radar_matrix3 <- bind_rows(
  max_min3,
  column_to_rownames(radar_data_norm3, "song.artist")
)
# Assign colors per song. The count comes from radar_matrix3 (the data frame
# plotted below) minus its two leading Max/Min rows, so the palette has
# exactly one colour per polygon and legend entry and does not depend on the
# separate top-5 object radar_matrix.
colors_border <- rainbow(nrow(radar_matrix3) - 2)
colors_in <- adjustcolor(colors_border, alpha.f = 0.25)
# Plot
fmsb::radarchart(
  radar_matrix3,
  axistype = 1,
  pcol = colors_border,
  pfcol = colors_in,
  plwd = 2,
  plty = 1,
  cglcol = "grey",
  cglty = 1,
  axislabcol = "grey",
  caxislabels = seq(0, 1, 0.2),
  cglwd = 0.8,
  vlcex = 0.9,
  title = "Top 3 Songs in 2023 — Audio Features Radar Chart"
)
# Legend listing the song rows (everything after the Max/Min rows).
legend(
  "topright",
  legend = rownames(radar_matrix3)[-c(1, 2)],
  bty = "n",
  pch = 20,
  col = colors_border,
  text.col = "black",
  cex = 0.8,
  inset = c(-0.1, 0) # Moves the legend to the right (negative x-inset)
)
library(dplyr)
library(tidyr)
library(fmsb)
library(scales)
library(shiny)
# UI ----
# Year drop-down in the sidebar (its choices start empty here) and the radar
# chart in the main panel.
ui <- fluidPage(
  titlePanel("Radar Chart of Top 3 Spotify Songs by Year"),
  sidebarLayout(
    sidebarPanel = sidebarPanel(
      selectInput(
        inputId = "selected_year",
        label = "Choose a Year:",
        choices = NULL
      )
    ),
    mainPanel = mainPanel(
      plotOutput(outputId = "radarPlot")
    )
  )
)
# Server ----
# Fills the year drop-down from the data, builds the radar data frame for the
# chosen year, and draws the chart with its legend underneath.
server <- function(input, output, session) {
  # Populate dropdown with available years (newest first, newest preselected)
  observe({
    updateSelectInput(
      session, "selected_year",
      choices = sort(unique(spotify$released_year), decreasing = TRUE),
      selected = max(spotify$released_year, na.rm = TRUE)
    )
  })

  # Reactive: radar data frame for the selected year. Returns NULL when the
  # year has no songs; otherwise a data frame whose first two rows are
  # "Max"/"Min" and whose remaining rows are the songs, named by song.artist.
  radar_matrix3 <- reactive({
    req(input$selected_year)

    # Top 3 songs for the selected year (ties broken arbitrarily)
    top3_year <- spotify %>%
      filter(released_year == input$selected_year) %>%
      slice_max(order_by = streams, n = 3, with_ties = FALSE) %>%
      dplyr::rename(
        danceability = `danceability_.`,
        speechiness = `speechiness_.`,
        energy = `energy_.`,
        acousticness = `acousticness_.`
      ) %>%
      dplyr::select(song.artist, bpm, danceability, speechiness, energy, acousticness)

    # Nothing to draw for a year without songs; checked before any
    # normalisation is attempted.
    if (nrow(top3_year) < 1) {
      return(NULL)
    }

    # Normalize numeric columns to 0-1. Labels are made unique first because
    # row names must be unique and two rows can share a song.artist label.
    radar_data_norm3 <- top3_year %>%
      mutate(
        song.artist = make.unique(as.character(song.artist)),
        across(where(is.numeric), ~ scales::rescale(.x, to = c(0, 1)))
      )

    # Create max/min rows
    max_min3 <- bind_rows(
      data.frame(
        bpm = 1, danceability = 1, speechiness = 1, energy = 1, acousticness = 1,
        row.names = c("Max")
      ),
      data.frame(
        bpm = 0, danceability = 0, speechiness = 0, energy = 0, acousticness = 0,
        row.names = c("Min")
      )
    )

    # Bind normalized data, set rownames to song.artist. column_to_rownames()
    # is namespace-qualified so this app does not rely on tibble having been
    # attached by an earlier chunk.
    bind_rows(
      max_min3,
      tibble::column_to_rownames(radar_data_norm3, "song.artist")
    )
  })

  # Plot output ----
  output$radarPlot <- renderPlot({
    # Named radar_df (not `matrix`) so the call to base::matrix() in layout()
    # below is not visually shadowed by a local data object.
    radar_df <- radar_matrix3()
    req(radar_df)

    # Colors: one per song row
    colors_border <- rainbow(nrow(radar_df) - 2)
    colors_in <- adjustcolor(colors_border, alpha.f = 0.25)

    # Two-row layout: chart on top (4x height), legend below (1x height)
    layout(matrix(c(1, 2), nrow = 2), heights = c(4, 1))

    # Top: Radar Chart
    par(mar = c(2, 2, 4, 2)) # reasonable margins
    fmsb::radarchart(
      radar_df,
      axistype = 1,
      pcol = colors_border,
      pfcol = colors_in,
      plwd = 2,
      plty = 1,
      cglcol = "grey",
      cglty = 1,
      axislabcol = "grey",
      caxislabels = seq(0, 1, 0.2),
      cglwd = 0.8,
      vlcex = 0.9,
      title = paste("Top 3 Songs in", input$selected_year, "— Audio Features Radar Chart")
    )

    # Bottom: Legend drawn on an empty panel
    par(mar = c(0, 0, 0, 0)) # no margins
    plot.new()
    legend(
      "center",
      legend = rownames(radar_df)[-c(1, 2)],
      bty = "n",
      pch = 20,
      col = colors_border,
      text.col = "black",
      cex = 0.9,
      ncol = 1, # You can change to 2+ if you want columns
      xpd = TRUE
    )
  })
}
# Run the app ----
shinyApp(ui, server)
G3;
Listening on http://127.0.0.1:6557
g
# Keep only the numeric columns of the data set.
numeric_cols <- spotify %>%
  dplyr::select(where(is.numeric))
numeric_cols
# Pairwise correlation matrix of all numeric columns, computed on the rows
# that have no missing value in any of them.
correlations <- cor(x = numeric_cols, use = "complete.obs")
correlations
artist_count released_year released_month released_day in_spotify_playlists in_spotify_charts streams in_apple_playlists
artist_count 1.000000000 0.061445644 0.009720347 -0.044766245 -0.0746868039 -0.002421656 -0.1090468634 -0.008712241
released_year 0.061445644 1.000000000 0.031372926 0.160042169 -0.3305741123 0.100988420 -0.1483509269 -0.155648773
released_month 0.009720347 0.031372926 1.000000000 -0.015820232 -0.0187633639 -0.031526077 0.0413240023 0.007380967
released_day -0.044766245 0.160042169 -0.015820232 1.000000000 -0.0320967227 0.042203010 0.0410748362 0.028622345
in_spotify_playlists -0.074686804 -0.330574112 -0.018763364 -0.032096723 1.0000000000 0.173307807 0.7650951338 0.709922084
in_spotify_charts -0.002421656 0.100988420 -0.031526077 0.042203010 0.1733078066 1.000000000 0.2454749140 0.213322335
streams -0.109046863 -0.148350927 0.041324002 0.041074836 0.7650951338 0.245474914 1.0000000000 0.663657168
in_apple_playlists -0.008712241 -0.155648773 0.007380967 0.028622345 0.7099220838 0.213322335 0.6636571679 1.000000000
in_apple_charts -0.079655066 0.007650642 -0.010603129 0.009855360 0.2087053786 0.565321488 0.2508103705 0.322358302
in_deezer_playlists -0.073406225 -0.265234545 -0.035433532 -0.041980554 0.7880546875 0.151785663 0.7185929567 0.645914528
in_deezer_charts 0.022218537 0.103287112 -0.001921194 0.063555630 0.1952907401 0.558419963 0.2594696320 0.409688235
in_shazam_charts -0.031812269 0.054492378 -0.090799317 0.040728906 0.1111503800 0.594678886 0.0587456970 0.187401561
bpm -0.067047448 -0.041957657 -0.051936323 -0.048020996 0.0260085534 0.028010413 0.0327164251 0.044415122
danceability_. 0.209581804 0.192054100 -0.034978955 0.076211130 -0.1066197808 0.075249362 -0.0754316227 0.011504320
valence_. 0.120784211 -0.064812792 -0.118074232 0.071279071 -0.0552336199 0.056602171 -0.0584550791 0.053187299
energy_. 0.149966302 0.130105474 -0.081977712 0.064572106 -0.0494256700 0.104328458 -0.0496657926 0.074416649
acousticness_. -0.101620287 -0.169751059 0.039266560 -0.010279631 0.0001543819 -0.078095007 0.0013286969 -0.088265650
instrumentalness_. -0.052814944 -0.014754771 0.031122232 0.007126726 0.0121080272 -0.012565007 -0.0009670221 -0.045488723
liveness_. 0.041035230 0.007441171 -0.017825352 0.002619619 -0.0339739648 -0.039153639 -0.0387277529 -0.046255149
speechiness_. 0.117955768 0.126711891 0.030599526 -0.017347379 -0.0719087372 -0.086192083 -0.0907281501 -0.101941835
in_apple_charts in_deezer_playlists in_deezer_charts in_shazam_charts bpm danceability_. valence_. energy_.
artist_count -0.079655066 -0.073406225 0.022218537 -0.031812269 -0.0670474482 0.209581804 0.120784211 0.149966302
released_year 0.007650642 -0.265234545 0.103287112 0.054492378 -0.0419576571 0.192054100 -0.064812792 0.130105474
released_month -0.010603129 -0.035433532 -0.001921194 -0.090799317 -0.0519363227 -0.034978955 -0.118074232 -0.081977712
released_day 0.009855360 -0.041980554 0.063555630 0.040728906 -0.0480209963 0.076211130 0.071279071 0.064572106
in_spotify_playlists 0.208705379 0.788054688 0.195290740 0.111150380 0.0260085534 -0.106619781 -0.055233620 -0.049425670
in_spotify_charts 0.565321488 0.151785663 0.558419963 0.594678886 0.0280104129 0.075249362 0.056602171 0.104328458
streams 0.250810371 0.718592957 0.259469632 0.058745697 0.0327164251 -0.075431623 -0.058455079 -0.049665793
in_apple_playlists 0.322358302 0.645914528 0.409688235 0.187401561 0.0444151222 0.011504320 0.053187299 0.074416649
in_apple_charts 1.000000000 0.198692411 0.356675982 0.443346418 0.0512089175 -0.003976097 0.061427394 0.153590558
in_deezer_playlists 0.198692411 1.000000000 0.218281108 0.135919298 0.0453831408 -0.104850821 -0.025849620 -0.028605485
in_deezer_charts 0.356675982 0.218281108 1.000000000 0.374829138 0.0370517105 0.087187954 0.075155386 0.108571701
in_shazam_charts 0.443346418 0.135919298 0.374829138 1.000000000 0.0891578410 -0.010179394 -0.003080391 0.095095549
bpm 0.051208918 0.045383141 0.037051710 0.089157841 1.0000000000 -0.140710959 0.050484657 0.003536259
danceability_. -0.003976097 -0.104850821 0.087187954 -0.010179394 -0.1407109592 1.000000000 0.390335848 0.186243358
valence_. 0.061427394 -0.025849620 0.075155386 -0.003080391 0.0504846571 0.390335848 1.000000000 0.354253808
energy_. 0.153590558 -0.028605485 0.108571701 0.095095549 0.0035362587 0.186243358 0.354253808 1.000000000
acousticness_. -0.105083100 0.028837909 -0.043917700 -0.071673565 -0.0020473755 -0.239007880 -0.068070884 -0.554771840
instrumentalness_. -0.010658818 0.021617457 -0.002299823 -0.015732282 -0.0009552758 -0.098154216 -0.136058212 -0.032914831
liveness_. -0.001551996 -0.005142997 0.002914949 -0.045209630 0.0005645641 -0.093272303 0.016319569 0.120967010
speechiness_. -0.157645853 -0.108361699 -0.073955127 -0.081685578 0.0247134810 0.173420342 0.036580343 -0.017125796
acousticness_. instrumentalness_. liveness_. speechiness_.
artist_count -0.1016202867 -0.0528149443 0.0410352297 0.11795577
released_year -0.1697510593 -0.0147547713 0.0074411712 0.12671189
released_month 0.0392665600 0.0311222324 -0.0178253521 0.03059953
released_day -0.0102796313 0.0071267258 0.0026196188 -0.01734738
in_spotify_playlists 0.0001543819 0.0121080272 -0.0339739648 -0.07190874
in_spotify_charts -0.0780950070 -0.0125650073 -0.0391536392 -0.08619208
streams 0.0013286969 -0.0009670221 -0.0387277529 -0.09072815
in_apple_playlists -0.0882656498 -0.0454887232 -0.0462551494 -0.10194183
in_apple_charts -0.1050831002 -0.0106588177 -0.0015519961 -0.15764585
in_deezer_playlists 0.0288379089 0.0216174566 -0.0051429975 -0.10836170
in_deezer_charts -0.0439176997 -0.0022998235 0.0029149486 -0.07395513
in_shazam_charts -0.0716735649 -0.0157322822 -0.0452096305 -0.08168558
bpm -0.0020473755 -0.0009552758 0.0005645641 0.02471348
danceability_. -0.2390078796 -0.0981542162 -0.0932723026 0.17342034
valence_. -0.0680708838 -0.1360582123 0.0163195694 0.03658034
energy_. -0.5547718398 -0.0329148310 0.1209670100 -0.01712580
acousticness_. 1.0000000000 0.0332206982 -0.0406689579 -0.02387702
instrumentalness_. 0.0332206982 1.0000000000 -0.0488636800 -0.08664221
liveness_. -0.0406689579 -0.0488636800 1.0000000000 -0.04518074
speechiness_. -0.0238770164 -0.0866422067 -0.0451807367 1.00000000
# Take the "streams" column of the correlation matrix: the correlation of
# every numeric variable with streams. The vector has to be created here
# because nothing earlier defines cor_with_streams.
cor_with_streams <- correlations[, "streams"]
# Sort and view, strongest positive correlation first
sort(cor_with_streams, decreasing = TRUE)
streams in_spotify_playlists in_deezer_playlists in_apple_playlists in_deezer_charts in_apple_charts in_spotify_charts
1.0000000000 0.7650951338 0.7185929567 0.6636571679 0.2594696320 0.2508103705 0.2454749140
in_shazam_charts released_month released_day bpm acousticness_. instrumentalness_. liveness_.
0.0587456970 0.0413240023 0.0410748362 0.0327164251 0.0013286969 -0.0009670221 -0.0387277529
energy_. valence_. danceability_. speechiness_. artist_count released_year
-0.0496657926 -0.0584550791 -0.0754316227 -0.0907281501 -0.1090468634 -0.1483509269
# Linear model of streams on three playlist counts and three audio features.
model <- lm(
  formula = streams ~ in_spotify_playlists + in_deezer_playlists + in_apple_playlists +
    danceability_. + energy_. + valence_.,
  data = spotify
)
summary(object = model)
Call:
lm(formula = streams ~ in_spotify_playlists + in_deezer_playlists +
in_apple_playlists + danceability_. + energy_. + valence_.,
data = spotify)
Residuals:
Min 1Q Median 3Q Max
-1.068e+09 -1.246e+08 -3.334e+07 9.771e+07 1.301e+09
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 182491364 45594165 4.003 6.80e-05 ***
in_spotify_playlists 42130 3557 11.844 < 2e-16 ***
in_deezer_playlists 566032 74514 7.596 7.90e-14 ***
in_apple_playlists 1507560 189228 7.967 5.11e-15 ***
danceability_. 223598 592024 0.378 0.7058
energy_. -717839 509189 -1.410 0.1590
valence_. -637129 379394 -1.679 0.0934 .
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 229500000 on 866 degrees of freedom
(80 observations deleted due to missingness)
Multiple R-squared: 0.6886, Adjusted R-squared: 0.6864
F-statistic: 319.2 on 6 and 866 DF, p-value: < 2.2e-16
# Reduced model that keeps only the three playlist-count predictors.
refined_model <- lm(
  formula = streams ~ in_spotify_playlists + in_deezer_playlists + in_apple_playlists,
  data = spotify
)
# Coefficient table and fit statistics of the reduced model.
summary(object = refined_model)
Call:
lm(formula = streams ~ in_spotify_playlists + in_deezer_playlists +
in_apple_playlists, data = spotify)
Residuals:
Min 1Q Median 3Q Max
-1.082e+09 -1.173e+08 -3.519e+07 9.715e+07 1.311e+09
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 119309688 10537578 11.322 < 2e-16 ***
in_spotify_playlists 43264 3526 12.271 < 2e-16 ***
in_deezer_playlists 565426 74598 7.580 8.88e-14 ***
in_apple_playlists 1427508 186160 7.668 4.67e-14 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 2.3e+08 on 869 degrees of freedom
(80 observations deleted due to missingness)
Multiple R-squared: 0.686, Adjusted R-squared: 0.6849
F-statistic: 632.9 on 3 and 869 DF, p-value: < 2.2e-16
# Build a modelling table with the response and the three predictors,
# dropping every row that has a missing value in any of them.
data_complete <- spotify %>%
  dplyr::select(streams, in_spotify_playlists, in_deezer_playlists, in_apple_playlists) %>%
  na.omit()
# Refit the reduced model on the complete rows.
model <- lm(
  formula = streams ~ in_spotify_playlists + in_deezer_playlists + in_apple_playlists,
  data = data_complete
)
# Model predictions for the same rows the model was fitted on.
predicted_streams <- predict(object = model, newdata = data_complete)
# Store the predictions next to the observed values.
data_complete[["predicted_streams"]] <- predicted_streams
data_complete
# Observed streams against predicted streams; the dashed red line is y = x.
ggplot(
  data = data_complete,
  mapping = aes(x = streams, y = predicted_streams)
) +
  geom_point(colour = "steelblue", alpha = 0.6) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", colour = "red") +
  theme_minimal() +
  labs(
    title = "Actual vs Predicted Streams",
    x = "Actual Streams",
    y = "Predicted Streams"
  )
# Plot residuals
# The residuals are paired with the fitted values of the same model in a
# dedicated data frame, so ggplot reads both aesthetics from `data` instead of
# falling back to a free-standing global vector whose length and order could
# drift from the plotted rows.
# `residuals` is still assigned so any code that uses the vector keeps working.
residuals <- resid(model)
residual_data <- data.frame(
  predicted_streams = fitted(model),
  residuals = residuals
)
View(data_complete)
ggplot(residual_data, aes(x = predicted_streams, y = residuals)) +
  geom_point(alpha = 0.6, color = "darkorange") +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
  labs(
    title = "Residual Plot",
    x = "Predicted Streams",
    y = "Residuals"
  ) +
  theme_minimal()
# Step 1: Load required libraries
library(caret)
Warning: package ‘caret’ was built under R version 4.3.3
Loading required package: lattice
# Fix the random seed so the fold assignment can be reproduced.
set.seed(seed = 123)
# Resampling scheme: 10-fold cross-validation.
train_control <- trainControl(number = 10, method = "cv")
# Response and predictors (the same three playlist counts as before).
model_formula <- streams ~ in_spotify_playlists + in_deezer_playlists + in_apple_playlists
# Fit a linear regression through caret::train() under that scheme.
cv_model <- train(
  model_formula,
  data = data_complete,
  trControl = train_control,
  method = "lm"
)
# Show the cross-validated performance summary.
print(x = cv_model)
Linear Regression
873 samples
3 predictor
No pre-processing
Resampling: Cross-Validated (10 fold)
Summary of sample sizes: 785, 786, 785, 786, 786, 787, ...
Resampling results:
RMSE Rsquared MAE
231655068 0.6849981 161789134
Tuning parameter 'intercept' was held constant at a value of TRUE
# Optional: Plot predictions vs. actuals again using cv_model$finalModel if desired
# Fit the final model once on the full data. method = "none" turns resampling
# off; without a trControl, train() would run its default resampling scheme,
# which is not needed when only the coefficients are wanted.
final_model <- train(
  streams ~ in_spotify_playlists + in_deezer_playlists + in_apple_playlists,
  data = data_complete,
  method = "lm",
  trControl = trainControl(method = "none")
)
# View final coefficients
coef(final_model$finalModel)
(Intercept) in_spotify_playlists in_deezer_playlists in_apple_playlists
119309687.75 43263.74 565425.67 1427508.47
# One-row data frame holding the playlist counts to score.
# Swap in your own values; the column names must match the model predictors.
new_input <- data.frame(
  in_spotify_playlists = 2500,
  in_deezer_playlists = 50,
  in_apple_playlists = 250
)

# Score the new row with the cross-validated model and show the result
predicted_streams <- predict(object = cv_model, newdata = new_input)
print(predicted_streams)
1
612617449
# Load required packages
library(shiny)

# Define UI: three playlist-count inputs, a button, and a text output that
# shows the predicted stream count.
ui <- fluidPage(
  titlePanel("Predict Song Streams"),
  sidebarLayout(
    sidebarPanel(
      numericInput("spotify", "Spotify Playlists:", value = 5000, min = 0),
      numericInput("deezer", "Deezer Playlists:", value = 1000, min = 0),
      numericInput("apple", "Apple Playlists:", value = 2000, min = 0),
      actionButton("predict", "Predict Streams")
    ),
    mainPanel(
      h3("Predicted Streams:"),
      verbatimTextOutput("prediction")
    )
  )
)

# Define server logic. Uses the global `cv_model` fitted earlier.
server <- function(input, output) {
  # Recomputed only when the button is clicked. Declaring the output once and
  # feeding it from an eventReactive avoids re-registering output$prediction
  # inside an observer on every click.
  prediction <- eventReactive(input$predict, {
    # A cleared numeric input arrives as NA; wait for all three values
    # instead of printing "NA".
    req(input$spotify, input$deezer, input$apple)
    new_input <- data.frame(
      in_spotify_playlists = input$spotify,
      in_deezer_playlists = input$deezer,
      in_apple_playlists = input$apple
    )
    predict(cv_model, newdata = new_input)
  })

  output$prediction <- renderText({
    format(round(prediction(), 0), big.mark = ",")
  })
}

# Run the application
shinyApp(ui = ui, server = server)
G3;
Listening on http://127.0.0.1:6557
grsession-arm64(40396) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
rsession-arm64(40397) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
rsession-arm64(40398) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
rsession-arm64(40399) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
rsession-arm64(40400) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
rsession-arm64(40401) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
?cv_model
No documentation for ‘cv_model’ in specified packages and libraries:
you could try ‘??cv_model’
cv_model
Linear Regression
873 samples
3 predictor
No pre-processing
Resampling: Cross-Validated (10 fold)
Summary of sample sizes: 785, 786, 785, 786, 786, 787, ...
Resampling results:
RMSE Rsquared MAE
231655068 0.6849981 161789134
Tuning parameter 'intercept' was held constant at a value of TRUE
# Assume `final_model` is already trained with lm()
colSums(is.na(data_complete))
streams in_spotify_playlists in_deezer_playlists in_apple_playlists predicted_streams
0 0 0 0 0
# caret wraps the fitted lm; pull it out so predict() can return
# confidence and prediction intervals.
lm_model <- cv_model[["finalModel"]]

# Interval predictions for every row of the training data
pred_conf <- predict(lm_model, interval = "confidence", newdata = data_complete)
pred_pred <- predict(lm_model, interval = "prediction", newdata = data_complete)

# Attach the fitted values and both interval bounds to the data
plot_data <- mutate(
  data_complete,
  predicted_streams = pred_conf[, "fit"],
  conf_low = pred_conf[, "lwr"],
  conf_high = pred_conf[, "upr"],
  pred_low = pred_pred[, "lwr"],
  pred_high = pred_pred[, "upr"]
)

# Fitted values against observed streams, with both interval bands shaded
ggplot(data = plot_data, mapping = aes(x = streams, y = predicted_streams)) +
  geom_point(color = "darkblue", alpha = 0.6) +
  geom_abline(slope = 1, intercept = 0, color = "blue", linetype = "dashed") +
  geom_ribbon(aes(ymin = conf_low, ymax = conf_high), alpha = 0.3, fill = "lightblue") +
  geom_ribbon(aes(ymin = pred_low, ymax = pred_high), alpha = 0.2, fill = "orange") +
  ggtitle("Predicted vs Actual Streams with Confidence and Prediction Intervals") +
  xlab("Actual Streams") +
  ylab("Predicted Streams") +
  theme_minimal()
# Underlying lm fit stored inside the caret train object
lm_model <- cv_model[["finalModel"]]

# Fitted values with confidence and prediction interval bounds
pred_conf <- predict(lm_model, interval = "confidence", newdata = data_complete)
pred_pred <- predict(lm_model, interval = "prediction", newdata = data_complete)

# Add fit and interval columns, then order rows by observed streams
plot_data <- arrange(
  mutate(
    data_complete,
    predicted_streams = pred_conf[, "fit"],
    conf_low = pred_conf[, "lwr"],
    conf_high = pred_conf[, "upr"],
    pred_low = pred_pred[, "lwr"],
    pred_high = pred_pred[, "upr"]
  ),
  streams
)

# Observed streams against fitted values, with shaded interval bands
ggplot(data = plot_data, mapping = aes(x = predicted_streams, y = streams)) +
  geom_point(color = "darkblue", alpha = 0.6) +
  geom_abline(slope = 1, intercept = 0, color = "blue", linetype = "dashed") +
  geom_ribbon(aes(ymin = conf_low, ymax = conf_high), alpha = 0.3, fill = "lightblue") +
  geom_ribbon(aes(ymin = pred_low, ymax = pred_high), alpha = 0.2, fill = "orange") +
  ggtitle("Predicted vs Actual Streams with Confidence and Prediction Intervals") +
  xlab("Predicted Streams") +
  ylab("Actual Streams") +
  theme_minimal()
# Same comparison as the ribbon version, but the interval bounds are drawn
# as lines: dashed purple = confidence bounds, dotted red = prediction bounds.
ggplot(data = plot_data, mapping = aes(x = predicted_streams, y = streams)) +
  geom_point(color = "darkblue", alpha = 0.6) +
  geom_abline(slope = 1, intercept = 0, color = "blue", linetype = "solid") +
  geom_line(mapping = aes(y = conf_low), linetype = "dashed", color = "purple") +
  geom_line(mapping = aes(y = conf_high), linetype = "dashed", color = "purple") +
  geom_line(mapping = aes(y = pred_low), linetype = "dotted", color = "red") +
  geom_line(mapping = aes(y = pred_high), linetype = "dotted", color = "red") +
  ggtitle("Actual vs Predicted Streams with Confidence and Prediction Interval Lines (R^2 = 0.68)") +
  xlab("Predicted Streams") +
  ylab("Actual Streams") +
  theme_minimal()
# Observed streams against Spotify playlist count, overlaid with the fitted
# values (blue line) and the confidence band from `plot_data`.
ggplot(data = plot_data, mapping = aes(x = in_spotify_playlists, y = streams)) +
  geom_point() +
  geom_line(mapping = aes(y = predicted_streams), color = "blue") +
  geom_ribbon(mapping = aes(ymin = conf_low, ymax = conf_high), alpha = 0.2) +
  ggtitle("Model Fit with Confidence Interval") +
  xlab("Spotify Playlists") +
  ylab("Streams") +
  theme_minimal()
# Load required packages
library(shiny)

# Combine Visuals/Define UI: one tab filters a scatter plot by mode, the other
# predicts streams from playlist counts.
ui <- fluidPage(
  titlePanel("Predict Spotify Song Streams"),
  tabsetPanel(
    tabPanel(
      "Visualize by Mode",
      sidebarLayout(
        sidebarPanel(
          checkboxGroupInput(
            "selected_modes", "Select Mode(s):",
            choices = unique(spotify$mode),
            selected = unique(spotify$mode)
          )
        ),
        mainPanel(
          plotOutput("modePlot")
        )
      )
    ),
    tabPanel(
      "Predict Streams",
      sidebarLayout(
        sidebarPanel(
          numericInput("spotify", "Spotify Playlists:", value = 5000, min = 0),
          numericInput("deezer", "Deezer Playlists:", value = 1000, min = 0),
          numericInput("apple", "Apple Playlists:", value = 2000, min = 0),
          actionButton("predict", "Predict Streams")
        ),
        mainPanel(
          h3("Predicted Streams:"),
          verbatimTextOutput("prediction")
        )
      )
    )
  )
)

# Define server logic. Uses the globals `final_model` and `spotify`.
server <- function(input, output) {
  # Recomputed only on button click. The output is declared once below rather
  # than being re-registered inside an observer on every click.
  prediction <- eventReactive(input$predict, {
    # Cleared numeric inputs arrive as NA; wait until all three are filled in.
    req(input$spotify, input$deezer, input$apple)
    new_input <- data.frame(
      in_spotify_playlists = input$spotify,
      in_deezer_playlists = input$deezer,
      in_apple_playlists = input$apple
    )
    predict(final_model, newdata = new_input)
  })

  output$prediction <- renderText({
    format(round(prediction(), 0), big.mark = ",")
  })

  # Scatter of streams vs Spotify playlist count for the selected modes
  output$modePlot <- renderPlot({
    req(input$selected_modes)
    filtered_data <- subset(spotify, mode %in% input$selected_modes)
    ggplot(filtered_data, aes(x = streams, y = in_spotify_playlists, color = mode)) +
      geom_point() +
      labs(
        title = "Streams vs Playlist Metrics by Mode",
        x = "Streams",
        y = "Number in Spotify Playlists"
      ) +
      theme_minimal()
  })
}

# Run the application
shinyApp(ui = ui, server = server)
G3;
Listening on http://127.0.0.1:6557
g
# Single variable view: streams against Spotify playlist count.
# The predictor belongs on x and the stream scale on y, so the observed points,
# the fitted line and the confidence band all share one y axis. With the axes
# the other way round, playlist counts were drawn on the same y scale as
# stream counts.
ggplot(plot_data, aes(x = in_spotify_playlists, y = predicted_streams)) +
  geom_point(aes(y = streams), alpha = 0.5) +
  geom_line() +
  geom_ribbon(aes(ymin = conf_low, ymax = conf_high), alpha = 0.2) +
  labs(
    title = "Prediction with Confidence Intervals",
    x = "Spotify Playlists",
    y = "Streams"
  )
# Standard lm diagnostic plots, then the residuals-vs-leverage panel on its own
plot(lm_model)
plot(lm_model, which = 5)

# Cook's distance for every observation used in the fit
cooksD <- cooks.distance(lm_model)

# Rule-of-thumb cut-off: 4 divided by the number of observations
n_obs <- nrow(data_complete)
threshold <- 4 / n_obs

# Positions of the observations whose distance exceeds the cut-off
influential_points <- which(cooksD > threshold)
print(influential_points)
X15 X16 X23 X42 X43 X48 X52 X58 X62 X72 X75 X85 X88 X111 X116 X122 X133 X147 X153 X155 X159 X160 X165 X167 X168 X170 X171 X181 X185 X187
15 16 23 42 43 48 51 55 59 68 70 78 80 98 102 108 116 127 132 134 137 138 142 144 145 146 147 152 155 157
X188 X193 X232 X240 X304 X321 X366 X367 X369 X379 X393 X396 X411 X423 X425 X426 X434 X444 X455 X458 X461 X463 X467 X470 X472 X496 X506 X508 X511 X514
158 161 199 207 267 284 325 326 328 338 352 355 367 379 381 382 390 397 408 411 414 416 420 423 425 449 459 461 464 467
X520 X531 X536 X556 X558 X563 X566 X567 X576 X582 X585 X592 X599 X600 X614 X617 X620 X622 X639 X658 X675 X719 X720 X740 X765 X841 X857 X864 X900 X903
473 484 489 509 511 516 519 520 528 534 537 544 550 551 564 566 569 570 583 598 610 650 651 667 688 764 780 787 821 824
X912
832
# Create a new dataset excluding influential rows.
# Select the rows to keep by position instead of negating `influential_points`:
# when no row is flagged, `data_complete[-integer(0), ]` returns zero rows
# rather than the full data set.
data_no_influential <- data_complete[
  setdiff(seq_len(nrow(data_complete)), influential_points),
  ,
  drop = FALSE
]
data_no_influential
# Refit the model using caret with cross-validation.
# Seed the RNG as for the original fit so the fold assignment, and therefore
# the reported resampling metrics, are reproducible.
set.seed(123)
cv_model_clean <- train(
  model_formula,
  data = data_no_influential,
  method = "lm",
  trControl = trainControl(method = "cv", number = 10)
)

# Cross-validated metrics side by side
cv_model$results # Original model
cv_model_clean$results # Model without influential points

# Plain lm fits on each data set
model_full <- lm(model_formula, data = data_complete)
model_reduced <- lm(model_formula, data = data_no_influential)

# NOTE: the two models are fitted to different sets of observations, so R
# warns here and the AIC values are not directly comparable; treat them as
# descriptive only.
AIC(model_full, model_reduced)
G2;H2;Warningh in AIC.default(model_full, model_reduced) :
models are not all fitted to the same number of observationsg
BIC(model_full, model_reduced)
G2;H2;Warningh in BIC.default(model_full, model_reduced) :
models are not all fitted to the same number of observationsg
# Underlying lm fit of the model trained without the influential rows
lm_model_clean <- cv_model_clean[["finalModel"]]

# Fitted values with confidence and prediction interval bounds
pred_conf_clean <- predict(lm_model_clean, interval = "confidence", newdata = data_no_influential)
pred_pred_clean <- predict(lm_model_clean, interval = "prediction", newdata = data_no_influential)

# Add fit and interval columns, then order rows by observed streams
plot_data_clean <- arrange(
  mutate(
    data_no_influential,
    predicted_streams = pred_conf_clean[, "fit"],
    conf_low = pred_conf_clean[, "lwr"],
    conf_high = pred_conf_clean[, "upr"],
    pred_low = pred_pred_clean[, "lwr"],
    pred_high = pred_pred_clean[, "upr"]
  ),
  streams
)

# Observed vs fitted streams; interval bounds drawn as lines
# (dashed purple = confidence, dotted red = prediction)
ggplot(data = plot_data_clean, mapping = aes(x = predicted_streams, y = streams)) +
  geom_point(color = "darkblue", alpha = 0.6) +
  geom_abline(slope = 1, intercept = 0, color = "blue", linetype = "solid") +
  geom_line(mapping = aes(y = conf_low), linetype = "dashed", color = "purple") +
  geom_line(mapping = aes(y = conf_high), linetype = "dashed", color = "purple") +
  geom_line(mapping = aes(y = pred_low), linetype = "dotted", color = "red") +
  geom_line(mapping = aes(y = pred_high), linetype = "dotted", color = "red") +
  ggtitle("Actual vs Predicted Streams with Confidence and Prediction Interval Lines") +
  xlab("Predicted Streams") +
  ylab("Actual Streams") +
  theme_minimal()
library(shiny)
library(dplyr)
library(ggplot2)
library(scales)
library(fmsb)
# Define UI
# Top-level navbar page for the dashboard. Every tab below declares outputs
# (plotOutput / verbatimTextOutput ids) that the server function fills in.
# The UI reads the globals `spotify` and `yearly_top_songs` when it is built,
# to populate the select and checkbox choices.
ui <- navbarPage(
title = div(
#img(src = "www/spotify_logo_black.png", height = "30px", style = "margin-top:-5px;"),
"Spotify Data Dashboard"
),
id = "main_navbar",
# Custom styling
# The CSS is injected through `header`: dark navbar, page and tab
# backgrounds, white text, and a green highlight on the active tab.
header = tags$head(
tags$style(HTML("
/* Navbar background */
.navbar-default {
background-color: #191414;
border-color: #191414;
}
/* Navbar title and tab text */
.navbar-default .navbar-brand,
.navbar-default .navbar-nav > li > a {
color: white !important;
font-weight: bold;
}
/* Active tab highlight */
.navbar-default .navbar-nav > .active > a,
.navbar-default .navbar-nav > .active > a:focus,
.navbar-default .navbar-nav > .active > a:hover {
background-color: #1DB954 !important;
color: black !important;
}
/* Tab content background */
.tab-pane {
background-color: #121212;
color: white;
padding: 30px;
}
/* Body background */
body {
background-color: #121212;
color: white;
}
/* List styling for better visibility */
ul {
color: white;
}
"))
),
# --- HOME TAB ---
# Static welcome text; no inputs or outputs.
tabPanel("Home",
fluidPage(
tags$div(
style = "text-align: center;",
#img(src = "www/spotify_logo_green.png", height = "120px"),
h2("Welcome to the Spotify Data Dashboard"),
p("This data includes songs released from 1930 - 2023. Use the tabs above to explore data visualizations and models related to Spotify songs."),
tags$ul(
tags$li("Explore trends across release years"),
tags$li("Discover top streamed tracks"),
tags$li("Visualize audio features with radar and density plots"),
tags$li("Compare playlists and stream predictions")
),
p("Use the tabs above to explore the features.")
)
)
),
# Group: Release Year Performance
# Three sub-tabs: density plot, top-10 bar chart, and a per-year top-5 chart
# driven by the "year" select input.
tabPanel("Release Year Performance",
tabsetPanel(
tabPanel("Density: Release Year",
plotOutput("densityPlot")
),
tabPanel("Top 10 Streamed Songs by Year",
plotOutput("top10Bar")
),
tabPanel("Top 5 Songs by Year",
sidebarLayout(
sidebarPanel(
# Year choices come from `yearly_top_songs`, newest first
selectInput("year", "Select Year:", choices = sort(unique(yearly_top_songs$released_year), decreasing = TRUE))
),
mainPanel(
plotOutput("topSongsPlot")
)
)
)
)
),
# Group: Audio Features
# Radar chart for the year chosen in "selected_year"; defaults to the most
# recent release year in `spotify`.
tabPanel("Radar: Top 3 Songs Audio Features",
sidebarLayout(
sidebarPanel(
selectInput("selected_year", "Choose a Year:",
choices = sort(unique(spotify$released_year), decreasing = TRUE),
selected = max(spotify$released_year, na.rm = TRUE))
),
mainPanel(
plotOutput("radarPlot")
)
)
),
# Group: Mode & Playlist Insights
# Scatter plot filtered by the modes ticked in "mode_select"; all modes are
# selected initially.
tabPanel("Scatter: Streams vs Playlists by Mode",
sidebarLayout(
sidebarPanel(
checkboxGroupInput("mode_select", "Select Mode(s):",
choices = unique(spotify$mode),
selected = unique(spotify$mode))
),
mainPanel(
plotOutput("scatterPlot")
)
)
),
# Group: Playlist Metrics & Predictions
# Pairs plot plus a prediction form: three numeric inputs and a button on the
# left, the actual-vs-predicted plot on the right.
tabPanel("Playlists & Streams Relationship",
tabsetPanel(
tabPanel("Pairs Plot",
plotOutput("pairsPlot")
),
tabPanel("Model Predictions",
fluidRow(
column(6,
h4("Predict Streams"),
numericInput("spotify", "Spotify Playlists:", value = 5000, min = 0),
numericInput("deezer", "Deezer Playlists:", value = 1000, min = 0),
numericInput("apple", "Apple Playlists:", value = 2000, min = 0),
actionButton("predict", "Predict Streams"),
br(), br(),
h5("Predicted Streams:"),
verbatimTextOutput("prediction")
),
column(6,
h4("Actual vs Predicted Streams"),
plotOutput("predictionPlot")
)
)
)
)
)
)
# Define Server
# Server logic for the dashboard. Relies on objects created earlier in the
# session: `spotify`, `top10_yearly`, `yearly_top_songs`, `plot_data` and
# `cv_model`.
server <- function(input, output, session) {
  # Density Plot ----
  output$densityPlot <- renderPlot({
    plot(
      density(spotify$released_year, na.rm = TRUE),
      main = "Density Plot of Released Year",
      xlab = "Released Year", col = "blue", lwd = 2
    )
  })

  # Scatter Plot ----
  # Streams vs Spotify playlist count, limited to the ticked modes
  output$scatterPlot <- renderPlot({
    filtered_data <- spotify[spotify$mode %in% input$mode_select, ]
    ggplot(filtered_data, aes(x = streams, y = in_spotify_playlists, color = mode)) +
      geom_point() +
      labs(
        title = "Streams vs Playlist Metrics by Mode",
        x = "Streams", y = "Number in Spotify Playlists"
      ) +
      theme_minimal()
  })

  # Top 10 Streamed Songs by Year ----
  output$top10Bar <- renderPlot({
    ggplot(top10_yearly, aes(x = factor(released_year), y = streams, fill = song.artist)) +
      geom_bar(stat = "identity") +
      labs(
        title = "Top 10 Streamed Songs By Year",
        x = "Year", y = "Streams", fill = "Song - Artist"
      ) +
      theme_minimal() +
      theme(axis.text.x = element_text(angle = 45, hjust = 1))
  })

  # Top 5 Songs per Year ----
  output$topSongsPlot <- renderPlot({
    selected_year_data <- yearly_top_songs %>%
      filter(released_year == input$year)
    # Replace bytes that are not valid UTF-8 with "*" so axis labels can render
    selected_year_data$track_name <- iconv(
      selected_year_data$track_name,
      from = "UTF-8", to = "UTF-8", sub = "*"
    )
    ggplot(selected_year_data, aes(x = reorder(track_name, -streams), y = streams, fill = song.artist)) +
      geom_bar(stat = "identity") +
      labs(
        title = paste("Top 5 Streamed Songs in", input$year),
        x = "Song", y = "Streams", fill = "Song - Artist"
      ) +
      theme_minimal() +
      theme(axis.text.x = element_text(angle = 45, hjust = 1))
  })

  # Radar Chart ----
  # Data frame in the layout fmsb::radarchart() expects: a "Max" row, a "Min"
  # row, then one row per song. NULL when the selected year has no songs.
  radar_matrix3 <- reactive({
    req(input$selected_year)
    top3_year <- spotify %>%
      filter(released_year == input$selected_year) %>%
      slice_max(order_by = streams, n = 3, with_ties = FALSE) %>%
      dplyr::rename(
        danceability = `danceability_.`,
        speechiness = `speechiness_.`,
        energy = `energy_.`,
        acousticness = `acousticness_.`
      ) %>%
      dplyr::select(song.artist, bpm, danceability, speechiness, energy, acousticness)

    # Bail out before rescaling: there is nothing to rescale in an empty frame
    if (nrow(top3_year) < 1) {
      return(NULL)
    }

    # Each feature is rescaled to [0, 1] across the selected songs only
    radar_data_norm3 <- top3_year %>%
      mutate(across(where(is.numeric), ~ rescale(.x, to = c(0, 1))))

    max_min3 <- data.frame(
      bpm = 1, danceability = 1, speechiness = 1, energy = 1, acousticness = 1,
      row.names = c("Max")
    ) %>%
      bind_rows(data.frame(
        bpm = 0, danceability = 0, speechiness = 0, energy = 0, acousticness = 0,
        row.names = c("Min")
      ))

    # Namespace-qualified because tibble is not attached by the library()
    # calls that precede this app.
    bind_rows(max_min3, radar_data_norm3 %>% tibble::column_to_rownames("song.artist"))
  })

  output$radarPlot <- renderPlot({
    # Named `radar_df` rather than `matrix` so it does not shadow base::matrix,
    # which is called a few lines further down.
    radar_df <- radar_matrix3()
    req(radar_df)
    # One colour per song; the first two rows are the Max/Min bounds
    colors_border <- rainbow(nrow(radar_df) - 2)
    colors_in <- adjustcolor(colors_border, alpha.f = 0.25)
    # Two stacked panels: the chart on top, the legend underneath
    layout(matrix(c(1, 2), nrow = 2), heights = c(4, 1))
    par(mar = c(2, 2, 4, 2))
    radarchart(
      radar_df,
      axistype = 1,
      pcol = colors_border,
      pfcol = colors_in,
      plwd = 2,
      plty = 1,
      cglcol = "grey",
      cglty = 1,
      axislabcol = "grey",
      caxislabels = seq(0, 1, 0.2),
      cglwd = 0.8,
      vlcex = 0.9,
      title = paste("Top 3 Songs in", input$selected_year)
    )
    par(mar = c(0, 0, 0, 0))
    plot.new()
    legend(
      "center",
      legend = rownames(radar_df)[-c(1, 2)],
      bty = "n", pch = 20, col = colors_border, text.col = "black", cex = 0.9
    )
  })

  # Pairs Plot ----
  output$pairsPlot <- renderPlot({
    selected_data <- dplyr::select(
      spotify,
      in_spotify_playlists, in_deezer_playlists, in_apple_playlists, streams
    )
    pairs(selected_data, main = "Pairs Plot of Playlist Counts and Streams")
  })

  # Actual vs Predicted Plot ----
  # Dashed purple = confidence bounds, dotted red = prediction bounds
  output$predictionPlot <- renderPlot({
    ggplot(plot_data, aes(x = predicted_streams, y = streams)) +
      geom_point(alpha = 0.6, color = "darkblue") +
      geom_abline(intercept = 0, slope = 1, linetype = "solid", color = "blue") +
      geom_line(aes(y = conf_low), color = "purple", linetype = "dashed") +
      geom_line(aes(y = conf_high), color = "purple", linetype = "dashed") +
      geom_line(aes(y = pred_low), color = "red", linetype = "dotted") +
      geom_line(aes(y = pred_high), color = "red", linetype = "dotted") +
      labs(
        title = "Actual vs Predicted Streams (R^2 = 0.68)",
        x = "Predicted Streams", y = "Actual Streams"
      ) +
      theme_minimal()
  })

  # Predict Streams ----
  # Recomputed only on button click. The output is declared once below rather
  # than being re-registered inside an observer on every click.
  prediction <- eventReactive(input$predict, {
    # Cleared numeric inputs arrive as NA; wait until all three are filled in
    req(input$spotify, input$deezer, input$apple)
    new_input <- data.frame(
      in_spotify_playlists = input$spotify,
      in_deezer_playlists = input$deezer,
      in_apple_playlists = input$apple
    )
    predict(cv_model, newdata = new_input)
  })

  output$prediction <- renderText({
    format(round(prediction(), 0), big.mark = ",")
  })
}

# Run the App
shinyApp(ui = ui, server = server)
G3;
Listening on http://127.0.0.1:6557
grsession-arm64(40420) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
rsession-arm64(40421) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
rsession-arm64(40422) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
rsession-arm64(40423) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
rsession-arm64(40424) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
rsession-arm64(40425) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
# Minimal app used to check whether the logo image is served:
# a page containing only the image, and a server that does nothing.
ui <- fluidPage(
  img(height = "100px", src = "spotify_logo_black.png")
)
server <- function(input, output) {
}
shinyApp(ui = ui, server = server)
G3;
Listening on http://127.0.0.1:6557
g
# Check that the logo is present in the www/ folder. The original path used
# ":" instead of a path separator, so the check could never succeed.
file.exists(file.path("www", "spotify_logo_black.png"))
[1] FALSE
list.files("www")
character(0)
getwd()
[1] "/Users/ryanoka/Documents/DS Bootcamp/R/R-Project"
G1;H1;Errorh: unexpected ‘<’ in “<” Error during wrapup: not that many frames on the stack Error: no more error handlers available (recursive errors?); invoking ‘abort’ restart g
list.files("www/")
[1] "spotify_logo_black.png" "spotify_logo_green.png"
title: "Spotify Dashboard"
output:
flexdashboard::flex_dashboard:
orientation: columns
runtime: shiny
img(src = "spotify_logo_black.png", height = "30px")
YAML
G1;H1;Errorh: object 'YAML' not found
gG1;H1;Error during wrapuph: not that many frames on the stack
Error: no more error handlers available (recursive errors?); invoking 'abort' restart
g
---
title: "Spotify Data Dashboard"
G1;H1;Errorh in -title : invalid argument to unary operator
gG1;H1;Error during wrapuph: not that many frames on the stack
Error: no more error handlers available (recursive errors?); invoking 'abort' restart
g
output: html_document
G1;H1;Errorh: object 'output' not found
Error during wrapup: not that many frames on the stack
Error: no more error handlers available (recursive errors?); invoking 'abort' restart
g
runtime: shiny
G1;H1;Errorh: object 'runtime' not found
gG1;H1;Error during wrapuph: not that many frames on the stack
Error: no more error handlers available (recursive errors?); invoking 'abort' restart
g
---
shiny::addResourcePath("www", "www")
G1;H1;Errorh in -shiny::addResourcePath("www", "www") :
invalid argument to unary operator
Error during wrapup: not that many frames on the stack
Error: no more error handlers available (recursive errors?); invoking 'abort' restart
g
# Define UI
# Top-level navbar page for the dashboard. Every tab declares outputs that the
# server function fills in. The UI reads the globals `spotify` and
# `yearly_top_songs` when it is built, to populate the input choices.
# The logo paths use the "www" resource prefix.
ui <- navbarPage(
  title = div(
    img(src = "www/spotify_logo_green.png", height = "30px", style = "margin-top:-5px;"),
    "Spotify Data Dashboard"
  ),
  id = "main_navbar",
  # Custom styling: dark navbar with green text, light main panels
  header = tags$head(
    tags$style(HTML("
/* Navbar background */
.navbar-default {
background-color: #191414;
border-color: #191414;
}
/* Navbar title (brand) text color */
.navbar-default .navbar-brand {
color: #1DB954 !important; /* Spotify green */
}
/* Navbar title hover/focus */
.navbar-default .navbar-brand:hover,
.navbar-default .navbar-brand:focus {
color: #1ed760 !important;
}
/* Tab panel (menu items) text color */
.navbar-default .navbar-nav > li > a {
color: #1DB954 !important;
}
/* Tab hover/focus */
.navbar-default .navbar-nav > li > a:hover,
.navbar-default .navbar-nav > li > a:focus {
color: #1ed760 !important;
background-color: transparent;
}
/* Keep main panel background */
.main-panel, .col-sm-8 {
background-color: #f5f5f5 !important;
color: black !important;
padding: 20px;
border-radius: 6px;
}
"))
  ),
  # --- HOME TAB ---
  # Static welcome text; no inputs or outputs.
  tabPanel(
    "Home",
    fluidPage(
      tags$div(
        style = "text-align: left;",
        img(src = "www/spotify_logo_black.png", height = "120px"),
        h2("Welcome to the Spotify Data Dashboard"),
        p("This data includes songs released from 1930 - 2023. Use the tabs above to explore data visualizations and models related to Spotify songs."),
        tags$ul(
          tags$li("Explore trends across release years"),
          tags$li("Discover top streamed tracks"),
          tags$li("Visualize audio features with radar and scatter plots"),
          tags$li("Compare playlists and stream predictions")
        ),
        p("Use the tabs above to explore the features.")
      )
    )
  ),
  # Group: Release Year Performance
  tabPanel(
    "Release Year Performance",
    tabsetPanel(
      tabPanel(
        "Density: Release Year",
        plotOutput("densityPlot")
      ),
      tabPanel(
        "Top 10 Overall - Top Streamed Song From Each Released Year",
        plotOutput("top10Bar"),
        p('Filtering for the top streamed from each released year and then displaying which songs ranked in top 10 by overall stream counts.')
      ),
      tabPanel(
        "Top 5 Songs by Year",
        sidebarLayout(
          sidebarPanel(
            # Year choices come from `yearly_top_songs`, newest first
            selectInput(
              "year", "Select Year:",
              choices = sort(unique(yearly_top_songs$released_year), decreasing = TRUE)
            )
          ),
          mainPanel(
            plotOutput("topSongsPlot")
          )
        )
      )
    )
  ),
  # Group: Audio Features
  # Radar chart for the chosen year, with a glossary of the plotted features.
  tabPanel(
    "Radar: Top 3 Songs Audio Features",
    sidebarLayout(
      sidebarPanel(
        selectInput(
          "selected_year", "Choose a Year:",
          choices = sort(unique(spotify$released_year), decreasing = TRUE),
          selected = max(spotify$released_year, na.rm = TRUE)
        ),
        # No trailing comma after the last item: an empty final argument is
        # only tolerated by some tag functions.
        tags$ul(
          style = "padding-left: 15px; margin-left: 0;", # Adjust padding as needed
          tags$li("BPM - Beats per minute, tempo or speed of the song. "),
          tags$li("Acousticness - Confidence measure of how much a track sounds like it was made with live instruments and natural sounds, rather than electronic or synthesized sounds. "),
          tags$li("Energy - Encompasses a broader range of elements, including momentum, intensity, and emotional impact."),
          tags$li("Speechiness - Measure of how much spoken word content is present in a track, as opposed to purely musical elements."),
          tags$li("Danceability - Measure of how easily someone could move their body to the rhythm and structure of the music.")
        )
      ),
      mainPanel(
        plotOutput("radarPlot")
      )
    )
  ),
  # Group: Playlist Metrics & Predictions
  tabPanel(
    "Playlists & Streams Relationship",
    tabsetPanel(
      tabPanel(
        "Scatter: Streams vs Playlists by Mode",
        sidebarLayout(
          sidebarPanel(
            checkboxGroupInput(
              "mode_select", "Select Mode(s):",
              choices = unique(spotify$mode),
              selected = unique(spotify$mode)
            ),
            tags$ul(
              style = "padding-left: 15px; margin-left: 0;", # Adjust padding as needed
              tags$li("Modes refer to two different types of musical scales and keys."),
              tags$li("Major - Sounds bright, happy, and uplifting."),
              tags$li("Minor - Sounds sad, melancholic, or dark.")
            )
          ),
          mainPanel(
            plotOutput("scatterPlot")
          )
        )
      ),
      tabPanel(
        "Pairs Plot",
        plotOutput("pairsPlot"),
        # Typo fixed in the caption: "plalylist" -> "playlist"
        p('The pairs plot is showing the linear relationship between the number of streams to the number of playlists songs appear in. Spotify, Deezer, and Apple playlist appearances had the most linear relationship to streams.')
      ),
      tabPanel(
        "Model Predictions",
        fluidRow(
          # Left column: prediction form and result
          column(
            6,
            p('Input an estimate for the number of appearances in each of the playlists in order to predict the number of streams. The model is trained on this data to predict the number of total streams (2008-2024).\n'),
            h4("Predict Streams"),
            numericInput("spotify", "Spotify Playlists:", value = 5000, min = 0),
            numericInput("deezer", "Deezer Playlists:", value = 1000, min = 0),
            numericInput("apple", "Apple Playlists:", value = 2000, min = 0),
            actionButton("predict", "Predict Streams"),
            br(), br(),
            h5("Predicted Streams:"),
            verbatimTextOutput("prediction")
          ),
          # Right column: actual-vs-predicted plot with explanation
          column(
            6,
            h4("Actual vs Predicted Streams"),
            plotOutput("predictionPlot"),
            p('The linear model displayed is trained on streams in relation to Spotify, Deezer, and Apple playlist appearances. Using data from each song, it is plotting the actual stream numbers to the predicted value with confidence and predictive intervals.')
          )
        )
      )
    )
  )
)
# Define Server
# Server logic for the dashboard. Relies on objects created earlier in the
# session: `spotify`, `top10_yearly`, `yearly_top_songs`, `plot_data` and
# `cv_model`.
server <- function(input, output, session) {
  # Density Plot ----
  output$densityPlot <- renderPlot({
    plot(
      density(spotify$released_year, na.rm = TRUE),
      main = "Density Plot of Song Counts by Released Year",
      xlab = "Released Year", col = "blue", lwd = 2
    )
  })

  # Scatter Plot ----
  # Streams vs Spotify playlist count, limited to the ticked modes
  output$scatterPlot <- renderPlot({
    filtered_data <- spotify[spotify$mode %in% input$mode_select, ]
    ggplot(filtered_data, aes(x = streams, y = in_spotify_playlists, color = mode)) +
      geom_point() +
      labs(
        title = "Streams vs Playlist Metrics by Mode",
        x = "Streams", y = "Number in Spotify Playlists"
      ) +
      theme_minimal()
  })

  # Top 10 Streamed Songs by Year ----
  output$top10Bar <- renderPlot({
    ggplot(top10_yearly, aes(x = factor(released_year), y = streams, fill = song.artist)) +
      geom_bar(stat = "identity") +
      labs(
        title = "Top Streamed Song For Each Released Year: Top 10 Ranking Overall",
        x = "Year", y = "Streams", fill = "Song - Artist"
      ) +
      theme_minimal() +
      theme(axis.text.x = element_text(angle = 45, hjust = 1))
  })

  # Top 5 Songs per Year ----
  output$topSongsPlot <- renderPlot({
    selected_year_data <- yearly_top_songs %>%
      filter(released_year == input$year)
    # Replace bytes that are not valid UTF-8 with "*" so axis labels can render
    selected_year_data$track_name <- iconv(
      selected_year_data$track_name,
      from = "UTF-8", to = "UTF-8", sub = "*"
    )
    ggplot(selected_year_data, aes(x = reorder(track_name, -streams), y = streams, fill = song.artist)) +
      geom_bar(stat = "identity") +
      labs(
        title = paste("Top 5 Streamed Songs in", input$year),
        x = "Song", y = "Streams", fill = "Song - Artist"
      ) +
      theme_minimal() +
      theme(axis.text.x = element_text(angle = 45, hjust = 1))
  })

  # Radar Chart ----
  # Data frame in the layout fmsb::radarchart() expects: a "Max" row, a "Min"
  # row, then one row per song. NULL when the selected year has no songs.
  radar_matrix3 <- reactive({
    req(input$selected_year)
    top3_year <- spotify %>%
      filter(released_year == input$selected_year) %>%
      slice_max(order_by = streams, n = 3, with_ties = FALSE) %>%
      dplyr::rename(
        danceability = `danceability_.`,
        speechiness = `speechiness_.`,
        energy = `energy_.`,
        acousticness = `acousticness_.`
      ) %>%
      dplyr::select(song.artist, bpm, danceability, speechiness, energy, acousticness)

    # Bail out before rescaling: there is nothing to rescale in an empty frame
    if (nrow(top3_year) < 1) {
      return(NULL)
    }

    # Each feature is rescaled to [0, 1] across the selected songs only
    radar_data_norm3 <- top3_year %>%
      mutate(across(where(is.numeric), ~ rescale(.x, to = c(0, 1))))

    max_min3 <- data.frame(
      bpm = 1, danceability = 1, speechiness = 1, energy = 1, acousticness = 1,
      row.names = c("Max")
    ) %>%
      bind_rows(data.frame(
        bpm = 0, danceability = 0, speechiness = 0, energy = 0, acousticness = 0,
        row.names = c("Min")
      ))

    # Namespace-qualified so the call does not depend on tibble being attached
    bind_rows(max_min3, radar_data_norm3 %>% tibble::column_to_rownames("song.artist"))
  })

  output$radarPlot <- renderPlot({
    # Named `radar_df` rather than `matrix` so it does not shadow base::matrix,
    # which is called a few lines further down.
    radar_df <- radar_matrix3()
    req(radar_df)
    # One colour per song; the first two rows are the Max/Min bounds
    colors_border <- rainbow(nrow(radar_df) - 2)
    colors_in <- adjustcolor(colors_border, alpha.f = 0.25)
    # Two stacked panels: the chart on top, the legend underneath
    layout(matrix(c(1, 2), nrow = 2), heights = c(4, 1))
    par(mar = c(2, 2, 4, 2))
    radarchart(
      radar_df,
      axistype = 1,
      pcol = colors_border,
      pfcol = colors_in,
      plwd = 2,
      plty = 1,
      cglcol = "grey",
      cglty = 1,
      axislabcol = "grey",
      caxislabels = seq(0, 1, 0.2),
      cglwd = 0.8,
      vlcex = 0.9,
      title = paste("Top 3 Songs in", input$selected_year)
    )
    par(mar = c(0, 0, 0, 0))
    plot.new()
    legend(
      "center",
      legend = rownames(radar_df)[-c(1, 2)],
      bty = "n", pch = 20, col = colors_border, text.col = "black", cex = 0.9
    )
  })

  # Pairs Plot ----
  output$pairsPlot <- renderPlot({
    selected_data <- dplyr::select(
      spotify,
      in_spotify_playlists, in_deezer_playlists, in_apple_playlists, streams
    )
    pairs(selected_data, main = "Pairs Plot of Playlist Counts and Streams")
  })

  # Actual vs Predicted Plot ----
  # Dashed purple = confidence bounds, dotted red = prediction bounds
  output$predictionPlot <- renderPlot({
    ggplot(plot_data, aes(x = predicted_streams, y = streams)) +
      geom_point(alpha = 0.6, color = "darkblue") +
      geom_abline(intercept = 0, slope = 1, linetype = "solid", color = "blue") +
      geom_line(aes(y = conf_low), color = "purple", linetype = "dashed") +
      geom_line(aes(y = conf_high), color = "purple", linetype = "dashed") +
      geom_line(aes(y = pred_low), color = "red", linetype = "dotted") +
      geom_line(aes(y = pred_high), color = "red", linetype = "dotted") +
      labs(
        title = "Actual vs Predicted Streams (R^2 = 0.68)",
        x = "Predicted Streams", y = "Actual Streams"
      ) +
      theme_minimal()
  })

  # Predict Streams ----
  # Recomputed only on button click. The output is declared once below rather
  # than being re-registered inside an observer on every click.
  prediction <- eventReactive(input$predict, {
    # Cleared numeric inputs arrive as NA; wait until all three are filled in
    req(input$spotify, input$deezer, input$apple)
    new_input <- data.frame(
      in_spotify_playlists = input$spotify,
      in_deezer_playlists = input$deezer,
      in_apple_playlists = input$apple
    )
    predict(cv_model, newdata = new_input)
  })

  output$prediction <- renderText({
    format(round(prediction(), 0), big.mark = ",")
  })
}

# Run the App
shinyApp(ui = ui, server = server)
G3;
Listening on http://127.0.0.1:4945
gG2;H2;Warningh: Removed 1 row containing missing values or values outside the scale range (`geom_point()`).g
rsession-arm64(34332) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.